1 The data

# Load the height data and format it:
#  - block, prov, clon and tree become factors
#  - age.sc: age scaled but NOT centered (age is defined on R+, so we only reduce it)
data <- readRDS(file = "../data/sub_portugal_5provs_4blocks.rds") %>%
  mutate(across(c(block, prov, clon, tree), as.factor)) %>% # across() replaces superseded mutate_at()
  mutate(age.sc = as.vector(scale(age, center = FALSE)))    # TRUE/FALSE, never T/F
  # mutate(age.sc = as.vector(scale(age))) # alternative: mean-centered age

The dataset includes:

  • Height data from a provenance trial (in Portugal) of maritime pine saplings.
  • Randomized block design. Here I selected 5 provenances and 4 blocks.
  • Saplings have different ages: 11, 15, 20 and 27 month old.
# Contingency table: number of trees per provenance (rows) and block (columns)
kable(table(data$prov, data$block), caption = "Provenance against block number.")
Provenance against block number.
34 35 36 38
LEI 78 71 72 82
MIM 44 45 60 60
PIE 26 29 34 34
SAC 21 20 18 21
VAL 37 43 42 42
# Contingency table: number of trees per provenance (rows) and age class (columns)
kable(table(data$prov, as.factor(data$age)), caption = "Provenance against age (in months).")
Provenance against age (in months).
11 15 20 27
LEI 92 84 65 62
MIM 72 59 40 38
PIE 36 33 27 27
SAC 23 23 17 17
VAL 48 44 37 35
# Overall distribution of sapling height
height_hist <- ggplot(data, aes(x = height)) +
  geom_histogram(color = "darkblue", fill = "lightblue")
height_hist + theme_bw()
Height versus age.

Height versus age.

# Height distribution split by age class (one panel per age)
ggplot(data, aes(x = height, color = as.factor(age))) +
  geom_histogram(fill = "white", alpha = 0.5, position = "identity") +
  facet_wrap(~as.factor(age)) +
  theme_bw() +
  theme(legend.position = "none")
Height versus age.

Height versus age.

# Height ~ age: linear fit (left) versus quadratic fit (right)
scatter_base <- ggplot(data, aes(x = age, y = height)) +
  geom_point(alpha = 0.2) +
  theme_bw() +
  theme(axis.title = element_text(size = 16))
plot_grid(
  scatter_base + stat_smooth(method = "lm", col = "red"),
  scatter_base + stat_smooth(method = "lm", col = "red", formula = y ~ poly(x, 2))
)
Height versus age.

Height versus age.

# log(height) ~ age: linear fit (left) versus quadratic fit (right)
log_scatter_base <- ggplot(data, aes(x = age, y = log(height))) +
  geom_point(alpha = 0.2) +
  theme_bw() +
  theme(axis.title = element_text(size = 16))
plot_grid(
  log_scatter_base + stat_smooth(method = "lm", col = "red"),
  log_scatter_base + stat_smooth(method = "lm", col = "red", formula = y ~ poly(x, 2))
)
Height versus age.

Height versus age.

# Height distribution for every provenance x block combination
ggplot(data, aes(x = height, color = block)) +
  geom_histogram(fill = "white", alpha = 0.5, position = "identity") +
  facet_grid(prov ~ block, labeller = label_both) +
  theme_bw() +
  theme(legend.position = "none")
Height distribution by Provenance and Block.

Height distribution by Provenance and Block.

2 “Fixed effects” models

Dummy variables for each level = Regularized intercepts, because we use weakly informative priors. But no information shared between intercepts. (See P299 in Statistical Rethinking of McElreath)

First, we are going to try three different likelihoods: the normal distribution, the normal distribution with a log-transformed response variable and a lognormal distribution.

2.1 Different distributions

2.1.1 Normal distribution mN

Mathematical model

\[\begin{equation} \begin{aligned} h_{i} & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]

Data in a list.

# Bundle the data for Stan
data.list <- list(N = nrow(data),                 # Number of observations
                  y = data$height,                # Response variable
                  age = data$age.sc,              # Tree age (scaled)
                  nprov = nlevels(data$prov),     # Number of provenances
                  nblock = nlevels(data$block),   # Number of blocks
                  prov = as.numeric(data$prov),   # Provenance index
                  bloc = as.numeric(data$block))  # Block index

# Compile and fit the model with a normal likelihood
mN = stan_model("mN.stan")
fit.mN <- sampling(mN, data = data.list, iter = 2000, chains = 2, cores = 2)

# Posterior summaries: medians, 95% intervals, R-hat and ESS diagnostics
# (TRUE/FALSE instead of T/F, which are reassignable)
broom::tidyMCMC(fit.mN,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mN with a normal distribution")
Model mN with a normal distribution
term estimate std.error conf.low conf.high rhat ess
beta_age 137.17866 7.765476 122.445202 152.70549 0.9991825 1959
beta_age2 151.12115 6.269038 138.942113 163.08563 1.0003679 1886
alpha_prov[1] 45.92467 7.030658 31.954684 60.12776 1.0006641 2118
alpha_prov[2] 30.10218 7.519204 15.621609 44.92392 0.9990958 2367
alpha_prov[3] 12.28498 8.350689 -4.054990 28.57003 0.9997337 2286
alpha_prov[4] 19.50809 8.736821 3.121270 36.17416 0.9992251 2435
alpha_prov[5] 25.32921 8.092449 9.272715 41.51663 1.0005580 1987
alpha_block[1] 21.70303 7.232428 7.651517 35.02408 0.9991084 2160
alpha_block[2] 26.80213 7.552256 12.028854 41.00510 0.9990695 2507
alpha_block[3] 35.52770 7.707398 19.910170 50.20886 1.0002847 2281
alpha_block[4] 49.63100 7.928611 34.064540 65.26942 0.9990736 2585
sigma_y 141.49786 3.616364 134.394562 148.62190 0.9997112 2777
lp__ -5048.53803 2.441098 -5054.499083 -5045.06559 1.0029609 798

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check: observed heights against 50 replicated datasets
yrep_mN <- as.matrix(fit.mN, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = data$height, yrep_mN) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

2.1.2 Normal distribution with log(y) mNlogy

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]
# Same data as data.list but with a log-transformed response
data.list.mNlogy <- list(N = nrow(data),                 # Number of observations
                         y = log(data$height),           # log(response variable)
                         age = data$age.sc,              # Tree age (scaled)
                         nprov = nlevels(data$prov),     # Number of provenances
                         nblock = nlevels(data$block),   # Number of blocks
                         prov = as.numeric(data$prov),   # Provenance index
                         bloc = as.numeric(data$block))  # Block index

Same stan code as the previous model!

# Fit the normal model on log(y); TRUE/FALSE instead of T/F
mNlogy = stan_model("mNlogy.stan")
fit.mNlogy <- sampling(mNlogy, data = data.list.mNlogy, iter = 2000, chains = 2,
                       control = list(max_treedepth = 14), cores = 2, save_warmup = FALSE)
broom::tidyMCMC(fit.mNlogy,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogy with a normal distribution and a log-transformed response variable")
Model mNlogy with a normal distribution and a log-transformed response variable
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0590213 0.3540056 2.3886657 3.7368434 0.9993254 625
beta_age2 -0.8529809 0.1703165 -1.1809340 -0.5366599 0.9991601 598
alpha_prov[1] 1.4173513 3.3803650 -5.3612399 8.1904353 1.0028008 234
alpha_prov[2] 1.3639220 3.3799601 -5.4099302 8.1415919 1.0028224 234
alpha_prov[3] 1.2581216 3.3792558 -5.4902359 8.0397761 1.0027840 235
alpha_prov[4] 1.3757216 3.3806710 -5.3966052 8.1362501 1.0027976 234
alpha_prov[5] 1.3735124 3.3787015 -5.3767142 8.1289650 1.0028222 234
alpha_block[1] 2.2812688 3.3770891 -4.5192376 9.1881944 1.0027868 233
alpha_block[2] 2.3041504 3.3761641 -4.4665490 9.2467735 1.0027215 233
alpha_block[3] 2.3324488 3.3767774 -4.4769462 9.2174757 1.0027396 233
alpha_block[4] 2.4199726 3.3777959 -4.3854048 9.3615308 1.0027444 233
sigma_y 0.3970389 0.0097395 0.3786201 0.4166466 1.0022870 734
lp__ 372.5172291 2.4389120 366.8161804 375.9237097 1.0024838 595
  • There may be a warning about effective sample size. This warning should disappear if we increase the number of iterations to 2,500.

  • We have to increase the maximum treedepth: See the Brief Guide to Stan’s Warnings

  • lp has largely increased.

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Extract the replicated datasets once and reuse the matrix
# (y_rep was previously assigned but never used; the matrix was re-extracted below)
y_rep <- as.matrix(fit.mNlogy, pars = "y_rep")
ppc_dens_overlay(y = log(data$height), y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

A better fit than mN.

2.1.3 LogNormal distribution mLogNR

Mathematical model

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]

We are going to use the same data list as in the first model mN.

# Fit the lognormal model (reuses data.list from mN); TRUE/FALSE instead of T/F
mLogNR = stan_model("mLogNR.stan")
fit.mLogNR <- sampling(mLogNR, data = data.list, iter = 2000,
                       chains = 2, control = list(max_treedepth = 14), save_warmup = FALSE)
broom::tidyMCMC(fit.mLogNR,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mLogNR with a LogNormal distribution on R")
Model mLogNR with a LogNormal distribution on R
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0307614 0.3328479 2.3505536 3.7293303 1.0048069 533
beta_age2 -0.8434647 0.1591522 -1.1791151 -0.5168304 1.0046194 538
alpha_prov[1] 1.1458945 3.3561201 -4.7535557 7.9004600 1.0105964 267
alpha_prov[2] 1.1003799 3.3568042 -4.7527428 7.8778128 1.0106404 266
alpha_prov[3] 0.9924494 3.3573113 -4.8992699 7.7765604 1.0105778 266
alpha_prov[4] 1.1016616 3.3574769 -4.7598206 7.8870408 1.0105980 267
alpha_prov[5] 1.1038531 3.3570309 -4.7782228 7.8661618 1.0106182 266
alpha_block[1] 2.5720399 3.3659581 -4.2569123 8.3785850 1.0110725 265
alpha_block[2] 2.6195725 3.3662227 -4.2951219 8.4114883 1.0110646 265
alpha_block[3] 2.6304366 3.3661878 -4.2271154 8.4196408 1.0111538 265
alpha_block[4] 2.7234038 3.3667849 -4.1140769 8.5167146 1.0111019 265
sigma_y 0.3966131 0.0094509 0.3789919 0.4163500 0.9999027 819
lp__ 372.6195200 2.4430696 366.8167358 376.1387599 1.0080040 611

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check for mLogNR on the original (untransformed) scale
yrep_mLogNR <- as.matrix(fit.mLogNR, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = data$height, yrep_mLogNR) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

2.2 More informative priors

In the previous models, our priors are very weakly informative. We can use slightly more informative priors, which can help convergence and decrease the running time. Below, we refit the model mNlogy (normal distribution with log(y)) with more informative priors.

Variance priors

Prior recommendations for scale parameters in hierarchical models

  • Very weakly informative prior: \(\sigma \sim \text{HalfCauchy}(0,25)\). From Gelman (2006): 8-schools example (p430). And here.

  • Weakly informative prior:

    • \(\sigma \sim \text{HalfCauchy}(0,1)\) (McElreath, First version) \(\sigma \sim \text{HalfCauchy}(0,5)\) (Betancourt in 8-schools example)

    • \(\sigma \sim \text{exponential}(1)\) (McElreath, Second version) or \(\sigma \sim \text{exponential}(0.1)\)

    • \(\sigma \sim \text{LogNormal}(0,1)\) (McElreath, Second version)

  • More informative prior : \(\sigma \sim \text{HalfNormal}(0,1)\) or \(\sigma \sim \text{Half-t}(3,0,1)\)

Beta priors

  • More informative priors: \(\beta \sim \mathcal{N}(0,1)\).

  • We can also consider that \(age\) is positively associated with height so we can add some information in our model and constrain \(\beta_{age}\) to positive values, such as: \(\beta_{age} \sim \text{LogNormal}(0,1)\) or \(\beta_{age} \sim \text{Gamma}(?,?)\).

2.2.1 Exponential sigma mNlogySigmaPrior

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Same model as mNlogy but with sigma ~ Exponential(1); TRUE/FALSE instead of T/F
mNlogySigmaPrior = stan_model("mNlogySigmaPrior.stan")
fit.mNlogySigmaPrior <- sampling(mNlogySigmaPrior, data = data.list.mNlogy, iter = 2000,
                                 chains = 2, cores = 2,
                                 control = list(max_treedepth = 14), save_warmup = FALSE)
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
broom::tidyMCMC(fit.mNlogySigmaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogySigmaPrior with a normal distribution and a log-transformed response variable")
Model mNlogySigmaPrior with a normal distribution and a log-transformed response variable
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0853939 0.3115853 2.5005104 3.6681742 1.002025 416
beta_age2 -0.8681602 0.1489561 -1.1516606 -0.5807643 1.001894 415
alpha_prov[1] 1.7237782 3.3001236 -5.1851552 8.1666440 1.000452 278
alpha_prov[2] 1.6744454 3.3009885 -5.2144130 8.0998992 1.000447 278
alpha_prov[3] 1.5646613 3.2997657 -5.3127174 7.9979732 1.000442 278
alpha_prov[4] 1.7134032 3.2999423 -5.1963587 8.1239492 1.000428 278
alpha_prov[5] 1.6902387 3.2998158 -5.2190709 8.1101379 1.000448 278
alpha_block[1] 1.9336633 3.3087644 -4.3830387 8.9411582 1.000624 279
alpha_block[2] 1.9742946 3.3093718 -4.3784910 8.9704145 1.000629 279
alpha_block[3] 1.9966810 3.3086366 -4.3427997 8.9796271 1.000621 278
alpha_block[4] 2.0963133 3.3092574 -4.2326275 9.1269228 1.000629 278
sigma_y 0.3961707 0.0091660 0.3794562 0.4156804 1.002524 554
lp__ 372.4438071 2.3761946 366.6220138 375.7898736 1.000649 664

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check for mNlogySigmaPrior on the log scale
yrep_sigma_prior <- as.matrix(fit.mNlogySigmaPrior, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_sigma_prior) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

2.2.2 More informative intercepts and slopes mNlogyBetaAlphaPrior

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,1)\\ \alpha_{PROV} & \sim \mathcal{N}(0,1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# More informative N(0,1) priors on intercepts and slopes; TRUE/FALSE instead of T/F
mNlogyBetaAlphaPrior = stan_model("mNlogyBetaAlphaPrior.stan")
fit.mNlogyBetaAlphaPrior <- sampling(mNlogyBetaAlphaPrior, data = data.list.mNlogy, iter = 2000,
                                     chains = 2, cores = 2,
                                     control = list(max_treedepth = 14), save_warmup = FALSE)
broom::tidyMCMC(fit.mNlogyBetaAlphaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyBetaAlphaPrior with a normal distribution and a log-transformed response variable")
Model mNlogyBetaAlphaPrior with a normal distribution and a log-transformed response variable
term estimate std.error conf.low conf.high rhat ess
beta_age 3.1004531 0.2912053 2.4961041 3.6821798 1.002079 451
beta_age2 -0.8746115 0.1396886 -1.1381178 -0.5838175 1.002189 460
alpha_prov[1] 1.7130004 0.3515719 1.0019500 2.3930041 1.006129 217
alpha_prov[2] 1.6523240 0.3509656 0.9518173 2.3356440 1.006158 213
alpha_prov[3] 1.5548197 0.3501575 0.8462078 2.2350552 1.005868 214
alpha_prov[4] 1.6611466 0.3527107 0.9585417 2.3443441 1.006805 210
alpha_prov[5] 1.6611592 0.3514362 0.9584232 2.3594503 1.006014 216
alpha_block[1] 1.9893679 0.3521466 1.2864968 2.6990990 1.005293 206
alpha_block[2] 2.0245532 0.3527592 1.3196379 2.7259085 1.004878 206
alpha_block[3] 2.0361805 0.3519823 1.3313723 2.7511280 1.005202 204
alpha_block[4] 2.1429670 0.3528397 1.4271725 2.8361540 1.005711 211
sigma_y 0.3961433 0.0095731 0.3786008 0.4148985 1.000449 956
lp__ 351.9985186 2.5645317 345.5261730 355.5557700 1.000178 584

Comparison of the distribution of \(y\) and the posterior predictive distributions (from yrep matrix).

# Posterior predictive check for mNlogyBetaAlphaPrior on the log scale
yrep_ba_prior <- as.matrix(fit.mNlogyBetaAlphaPrior, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_ba_prior) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

Comment: The intercept parameters change between the two models (mNlogyBetaAlphaPrior and mNlogySigmaPrior).

2.3 Vectorized models

We use again the model mNlogy (normal distribution with log(y) and very weakly informative priors)

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]

2.3.1 Option 1

# First vectorized version of mNlogy; TRUE/FALSE instead of T/F
mNlogyVectorize1 = stan_model("mNlogyVectorize1.stan")
## recompiling to avoid crashing R session
fit.mNlogyVectorize1 <- sampling(mNlogyVectorize1, data = data.list.mNlogy, iter = 2000,
                                 control = list(max_treedepth = 14),
                                 chains = 2, cores = 2, save_warmup = FALSE)
broom::tidyMCMC(fit.mNlogyVectorize1,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize1 with a normal distribution and log(y)")
Model mNlogyVectorize1 with a normal distribution and log(y)
term estimate std.error conf.low conf.high rhat ess
beta_age 3.0600624 0.3334058 2.3789984 3.7389229 1.0007327 403
alpha_prov[1] 1.5700576 3.3000869 -4.6634773 8.1686080 1.0024234 249
alpha_prov[2] 1.4982032 3.2998967 -4.7048100 8.1006768 1.0024422 249
alpha_prov[3] 1.4208945 3.3009499 -4.8134431 8.0235527 1.0024733 249
alpha_prov[4] 1.5111111 3.3002768 -4.7393854 8.1096778 1.0024572 249
alpha_prov[5] 1.5234478 3.3000213 -4.6795873 8.1328925 1.0024566 249
alpha_block[1] 2.1617757 3.3119904 -4.3750590 8.3947399 1.0022910 251
alpha_block[2] 2.1939238 3.3122452 -4.3656572 8.4215537 1.0023326 251
alpha_block[3] 2.2109547 3.3125969 -4.3607901 8.4132818 1.0023518 251
alpha_block[4] 2.3044364 3.3119205 -4.2591138 8.5367959 1.0023170 251
sigma_y 0.3967513 0.0095748 0.3792346 0.4161848 1.0052156 464
lp__ 372.5540892 2.4441470 366.7206065 376.1479273 0.9996837 524
  • Still need to increase max_treedepth here !

2.3.2 Option 2

# Second vectorized version of mNlogy; TRUE/FALSE instead of T/F
mNlogyVectorize2 = stan_model("mNlogyVectorize2.stan")
## recompiling to avoid crashing R session
fit.mNlogyVectorize2 <- sampling(mNlogyVectorize2, data = data.list.mNlogy, iter = 2000,
                                 control = list(max_treedepth = 14),
                                 chains = 2, cores = 2, save_warmup = FALSE)
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
broom::tidyMCMC(fit.mNlogyVectorize2,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize2 with a normal distribution and log(y)")
Model mNlogyVectorize2 with a normal distribution and log(y)
term estimate std.error conf.low conf.high rhat ess
beta_age 0.4079271 0.0145606 0.3819086 0.4384895 1.0017849 1054
alpha_prov[1] 2.7583218 3.2298185 -3.6545509 9.6148244 1.0177140 179
alpha_prov[2] 2.7035882 3.2309829 -3.7566788 9.6054385 1.0177927 178
alpha_prov[3] 2.6038981 3.2317715 -3.8451136 9.5291288 1.0178303 178
alpha_prov[4] 2.7201621 3.2294372 -3.6954186 9.5867217 1.0176730 178
alpha_prov[5] 2.7280783 3.2315433 -3.6836257 9.5635812 1.0177782 178
alpha_block[1] 2.2278763 3.2309493 -4.6737940 8.6385089 1.0177150 179
alpha_block[2] 2.2390444 3.2315742 -4.6434048 8.6831788 1.0177958 179
alpha_block[3] 2.2673620 3.2303644 -4.6117778 8.6788860 1.0177729 178
alpha_block[4] 2.3610523 3.2312457 -4.5000585 8.7731852 1.0176530 179
sigma_y 0.4097010 0.0105303 0.3903519 0.4312985 1.0007078 999
lp__ -505.4200715 2.4391393 -511.3613291 -501.9341373 0.9996881 553

Let’s compare the speed of the 5 models with a normal distribution with log(y)

# Compare elapsed times (warmup + sampling, per chain) across the five log(y) models.
# list() instead of c() to collect S4 stanfit objects; the nested lapply() calls
# are flattened into a readable pipeline.
list(fit.mNlogy, fit.mNlogySigmaPrior, fit.mNlogyBetaAlphaPrior,
     fit.mNlogyVectorize1, fit.mNlogyVectorize2) %>%
  lapply(get_elapsed_time) %>%
  lapply(data.frame) %>%
  bind_rows(.id = "model") %>%
  mutate(model = recode_factor(model,
                               "1" = "Model mNlogy with $\\sigma \\sim \\text{HalfCauchy}(0,25)$",
                               "2" = "Model mNlogySigmaPrior with $\\sigma \\sim \\text{Exponential}(1)$",
                               "3" = "Model mNlogyBetaAlphaPrior with $\\beta$ and $\\alpha  \\sim \\mathcal{N}(0,1)$ ",
                               "4" = "Model mNlogyVectorize1",
                               "5" = "Model mNlogyVectorize2")) %>%
  mutate(total = warmup + sample) %>%
  arrange(total) %>%
  kable(caption = "Model speed comparison of models with a normal distribution and a log-transformed response variable.")
Model speed comparison of models with a normal distribution and a log-transformed response variable.
model warmup sample total
Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) 18.3864 22.5171 40.9035
Model mNlogyVectorize1 20.4983 20.7858 41.2841
Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) 19.2170 23.9824 43.1994
Model mNlogyVectorize1 19.9800 23.3768 43.3568
Model mNlogyVectorize2 21.5884 23.1175 44.7059
Model mNlogyVectorize2 24.0733 26.2528 50.3261
Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) 91.4763 100.6600 192.1363
Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) 91.1304 111.7290 202.8594
Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) 84.9207 119.5970 204.5177
Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) 97.5780 107.6160 205.1940

For the subsequent models, we will use the more informative priors of mNlogyBetaAlphaPrior.

3 Multilevel models with one-varying intercept

Adaptive regularization

Links:

We are going to

3.1 Normal distribution with log(y)

3.1.1 Centered parameterization mmNlogyC

P357 McElreath (first version).

mmNlogyC (multi-level model with normal distribution and log(y) - centered parameterization)

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{PROV[p]}\\ \alpha_{PROV} & \sim \mathcal{N}(\mu_{\alpha_{PROV}},\sigma_{\alpha_{PROV}})\\ \mu_{\alpha_{PROV}} & \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Reduced data list (no block) for the one-varying-intercept models
data.list.reduced <- list(N = nrow(data),               # Number of observations
                          y = log(data$height),         # log(response variable)
                          age = data$age.sc,            # Tree age (scaled)
                          nprov = nlevels(data$prov),   # Number of provenances
                          prov = as.numeric(data$prov)) # Provenance index

# Centered parameterization; TRUE/FALSE instead of T/F
mmNlogyC <- stan_model("mmNlogyC.stan")
fit.mmNlogyC <- sampling(mmNlogyC, data = data.list.reduced, iter = 2000, chains = 2,
                         cores = 2, control = list(max_treedepth = 14), save_warmup = FALSE)
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# Posterior summaries for the centered multilevel model; TRUE instead of T
broom::tidyMCMC(fit.mmNlogyC,
                pars = c("beta_age", "beta_age2", "mean_alpha_prov", "sigma_alpha_prov",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyC with centered parameterization")
One-varying intercept model mmNlogyC with centered parameterization
term estimate std.error conf.low conf.high rhat ess
beta_age 2.8828267 0.3336757 2.2554695 3.5934452 1.009319 215
beta_age2 -0.7649118 0.1590296 -1.1015075 -0.4599303 1.008718 225
mean_alpha_prov 3.7947163 0.1667593 3.4459703 4.1223929 1.009050 245
sigma_alpha_prov 0.0589324 0.0520721 0.0169773 0.2208421 1.013428 336
sigma_y 0.4003032 0.0095925 0.3816535 0.4189197 1.001460 882
lp__ 362.8139984 2.5665427 356.7280601 366.4053299 1.000720 420

This model has some divergent transitions, low number of effective sample size and R-hat not exactly equal to one.

Divergent transitions

  • To avoid divergent transitions, we can increase adapt_delta.

McElreath (Second version): “[…] the target acceptance rate is controlled by the adapt_delta control parameter. The default is 0.95, which means that it aims to attain a 95% acceptance rate. It tries this during the warmup phase, adjusting the step size of each leapfrog step (go back to Chapter 9 if these terms aren’t familiar). When adapt_delta is set high, it results in a smaller step size, which means a more accurate approximation of the curved surface. It also means more computation, which means a slower chain. Increasing adapt_delta can often, but not always, help with divergent transitions.”

  • You can also try to add some information in your model, for instance with \(\beta_{age} \sim \text{LogNormal}(0,1)\).

  • Or you can reparameterize your model with the non-centered parameterization. (Best option!!)

# Posterior predictive check for the centered multilevel model (log scale)
yrep_mmC <- as.matrix(fit.mmNlogyC, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_mmC) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

# Trace plots for one provenance intercept and its scale, with NUTS divergences flagged
nuts_info_c <- nuts_params(fit.mmNlogyC)
mcmc_trace(as.array(fit.mmNlogyC),
           pars = c("alpha_prov[3]", "sigma_alpha_prov"),
           np = nuts_info_c) +
  xlab("Post-warmup iteration")

# Pairs plot to locate where divergences occur in parameter space
pairs_pars_c <- c("beta_age", "beta_age2", "mean_alpha_prov",
                  "sigma_alpha_prov", "alpha_prov[3]", "sigma_y")
mcmc_pairs(as.array(fit.mmNlogyC), np = nuts_params(fit.mmNlogyC),
           pars = pairs_pars_c,
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 2, div_shape = 19))

3.1.2 Non-centered parameterization mmNlogyNC

Links:


From McElreath, P429 (13.4.2.) of Statistical Rethinking (second version)

\[ \alpha \sim \mathcal{N}(\mu,\sigma)\]

is equivalent to

\[\begin{equation} \begin{aligned} \alpha &= \mu + \beta\\ \beta &\sim \mathcal{N}(0,\sigma) \end{aligned} \end{equation}\]

is equivalent to

\[\begin{equation} \begin{aligned} \alpha &= \mu + z\sigma\\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]

No parameters are left inside the prior.


From Updating: A Set of Bayesian Notes. Jeffrey B. Arnold. 20 Multilevel Models

These are two ways of writing the same model. However, they change the parameters that the HMC algorithm is actively sampling and thus can have different sampling performance.

However, neither is universally better.

  • If much data, the non-centered parameterization works better
  • If less data, the centered parameterization works better

And there is currently no ex-ante way to know which will work better, or at what amount of “data” the performance of one or the other is better. However, one other reason to use the centered parameterization (if it is also scaled) is that the Stan HMC implementation tends to be more efficient if all parameters are on the unit scale.


mmNlogyNC (multi-level model with normal distribution and log(y) - non-centered parameterization)

Mathematical model

\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{PROV[p]}\sigma_{PROV}\\ \alpha & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Non-centered parameterization of the one-varying-intercept model;
# TRUE/FALSE instead of T/F
mmNlogyNC <- stan_model("mmNlogyNC.stan")
fit.mmNlogyNC <- sampling(mmNlogyNC, data = data.list.reduced, iter = 2000, chains = 2,
                          cores = 2, control = list(max_treedepth = 14), save_warmup = FALSE)
broom::tidyMCMC(fit.mmNlogyNC,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov", "alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyNC with non-centered parameterization")
One-varying intercept model mmNlogyNC with non-centered parameterization
term estimate std.error conf.low conf.high rhat ess
beta_age 2.9149973 0.2957180 2.3534367 3.5210070 1.0001809 702
beta_age2 -0.7827389 0.1418270 -1.0677224 -0.5123850 1.0001225 723
z_prov[1] 0.8092344 0.6503304 -0.3516886 2.1785367 0.9997079 972
z_prov[2] 0.1130628 0.6218723 -1.1537892 1.2977990 0.9992637 980
z_prov[3] -1.0182242 0.7372131 -2.5842314 0.2909131 1.0001025 855
z_prov[4] 0.1585837 0.7259417 -1.2876329 1.5393172 0.9991521 1350
z_prov[5] 0.1471047 0.6331136 -1.0652745 1.4573545 0.9996255 1269
sigma_prov 0.0582236 0.0545279 0.0114027 0.1964875 1.0035889 491
alpha 3.7805073 0.1447384 3.4869954 4.0582316 0.9997202 725
sigma_y 0.3994355 0.0096618 0.3820791 0.4184778 1.0003311 1688
lp__ 348.6348255 3.0093531 341.4264827 353.0039259 1.0006348 392

No more divergent transitions! But still low effective sample size.

# Posterior predictive check for the non-centered multilevel model (log scale)
yrep_mmNC <- as.matrix(fit.mmNlogyNC, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_mmNC) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))

# Trace plots for the standardized provenance effect and its scale
nuts_info_nc <- nuts_params(fit.mmNlogyNC)
mcmc_trace(as.array(fit.mmNlogyNC),
           pars = c("z_prov[3]", "sigma_prov"),
           np = nuts_info_nc) +
  xlab("Post-warmup iteration")
## No divergences to plot.

# Pairwise posterior scatterplots of the key parameters; divergent
# transitions (if any) would be highlighted via np_style.
mcmc_pairs(as.array(fit.mmNlogyNC),
           np = nuts_params(fit.mmNlogyNC),
           pars = c("beta_age", "beta_age2", "alpha",
                    "sigma_prov", "z_prov[3]", "sigma_y"),
           off_diag_args = list(alpha = 1/3, size = 1),
           np_style = pairs_style_np(div_shape = 19, div_size = 3))

3.2 Lognormal distribution

3.2.1 Non-centered parameterization mmLogNnc


For a lognormal distribution the non-centered parameterization is different:

\[\alpha \sim \mathcal{logN}(log(\mu),\sigma)\]

is equivalent to

\[\begin{equation} \begin{aligned} \alpha &= e^{\text{log}(\mu) + z \cdot \sigma} \\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]

mmLogNnc (multi-level LogNormal distribution - non-centered parameterization)

Mathematical model

\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\text{log}(\mu_{i}),\sigma)\\ \mu_{i} & = e^{\text{log}(\alpha) + z_{PROV[p]}\sigma_{PROV}} + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} \\ \alpha & \sim \text{LogNormal}(0,1) \\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]
# Assemble the data list handed to Stan for the lognormal model.
# nrow(data) equals length(data$height) for a data frame: one row per tree.
data.list.reduced.logN <- list(
  N     = nrow(data),                 # number of observations
  y     = data$height,                # response: height on the natural scale
  age   = data$age.sc,                # scaled tree age
  nprov = length(unique(data$prov)),  # number of provenances
  prov  = as.numeric(data$prov)       # provenance index for each observation
)
# Compile the non-centered Lognormal model and sample. Style fixes: space
# around `<-`, and FALSE spelled out (`F` is reassignable, so relying on it
# is fragile). save_warmup = FALSE discards warmup draws to save memory.
mmLogNnc <- stan_model("mmLogNnc.stan")
fit.mmLogNnc <- sampling(mmLogNnc, data = data.list.reduced.logN, iter = 2000, chains = 2,
                         cores = 2, control = list(max_treedepth = 14),
                         save_warmup = FALSE)
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# Posterior summary table for the lognormal fit: medians, 95% intervals,
# Rhat, and effective sample size. TRUE is spelled out (not `T`).
broom::tidyMCMC(fit.mmLogNnc,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov","alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmLogNnc with Lognormal non-centered parameterization")
One-varying intercept model mmLogNnc with Lognormal non-centered parameterization
term estimate std.error conf.low conf.high rhat ess
beta_age 1.0779235 1.0213755 -0.9255132 3.084179 0.9997807 1714
beta_age2 2.2395739 1.0179360 0.2869805 4.208102 0.9995171 1721
z_prov[1] 0.6995530 0.7315178 -0.7943672 2.082789 0.9992234 1026
z_prov[2] -0.2425250 0.7762592 -1.8652479 1.309054 1.0003687 1270
z_prov[3] -0.5271000 0.8231962 -2.2318216 1.068830 1.0058908 1297
z_prov[4] 0.1726138 0.8260796 -1.4574600 1.793651 1.0007149 1462
z_prov[5] 0.2978564 0.7677986 -1.3186946 1.735952 1.0008190 1483
sigma_prov 0.0493763 0.0848143 0.0023718 0.246020 1.0027449 314
alpha 314.6862899 17.5512995 285.3169758 337.572033 1.0088476 263
sigma_y 0.5778931 0.0135961 0.5520952 0.605530 0.9990399 1557
lp__ 16.0071436 2.9141755 9.2460113 20.432407 0.9998380 399
# Posterior predictive check on the natural scale: observed heights against
# replicated data from the first 50 posterior draws.
yrep.logn <- as.matrix(fit.mmLogNnc, pars = "y_rep")
ppc_dens_overlay(y = data$height, yrep.logn[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))

# Trace plots for the worst-mixing parameters of the lognormal fit, with
# NUTS diagnostics (divergences) overlaid if any occurred.
np.logn <- nuts_params(fit.mmLogNnc)
mcmc_trace(as.array(fit.mmLogNnc),
           pars = c("z_prov[3]", "sigma_prov"),
           np = np.logn) +
  xlab("Post-warmup iteration")

# Pairwise posterior scatterplots for the lognormal fit; the two divergent
# transitions reported above would show up via np_style highlighting.
mcmc_pairs(as.array(fit.mmLogNnc),
           np = nuts_params(fit.mmLogNnc),
           pars = c("beta_age", "beta_age2", "alpha",
                    "sigma_prov", "z_prov[3]", "sigma_y"),
           off_diag_args = list(alpha = 1/3, size = 1),
           np_style = pairs_style_np(div_shape = 19, div_size = 3))